amex-dataset-EDA-plot¶

  • code reference : https://www.kaggle.com/code/cdeotte/time-series-eda
In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

# LOAD TRAIN DATA AND MERGE TARGETS ONTO FEATURES
# NOTE(review): the commented lines below are the earlier CSV-based loader
# (capped at 1M rows); the parquet version underneath loads the full set.
#df = pd.read_csv('./amex-default-prediction/train_data.csv', nrows=100_0000)
#df.S_2 = pd.to_datetime(df.S_2)
#df2 = pd.read_csv('./amex-default-prediction/train_labels.csv')
df = pd.read_parquet('./amex-parquet/train.parquet')
df.S_2 = pd.to_datetime(df.S_2)  # parse the S_2 (statement date) column so the x-axis plots as time
df2 = pd.read_csv('./amex-default-prediction/train_labels.csv')
# left-merge: every feature row keeps its customer's default label ('target')
df = df.merge(df2,on='customer_ID',how='left')
In [2]:
def plot_time_series(prefix='D', cols=None, display_ct=32):
    """Plot per-customer time series and value histograms for AMEX features.

    For each selected column, draws up to ``display_ct`` defaulting customers
    (blue) and ``display_ct`` non-defaulting customers (orange) over time,
    plus horizontal histograms of the plotted values.

    Parameters
    ----------
    prefix : str
        Feature-name prefix ('D', 'S', 'P', 'B', 'R').
    cols : list[int] or None
        Feature numbers to plot (e.g. [48, 54] -> 'D_48', 'D_54').
        None or an empty list plots every column with the given prefix.
    display_ct : int
        Maximum number of customers to draw per class.

    Relies on the module-level dataframe ``df`` containing 'customer_ID',
    'S_2' (datetime), 'target' and the feature columns.
    """
    # DETERMINE WHICH COLUMNS TO PLOT
    if cols is not None and len(cols) == 0:
        cols = None
    if cols is None:
        # feature columns follow the layout <prefix>_<number>;
        # sort numerically so e.g. D_2 precedes D_10
        COLS = df.columns[2:-1]
        COLS = np.sort([int(x[2:]) for x in COLS if x[0] == prefix])
        COLS = [f'{prefix}_{x}' for x in COLS]
        print('#'*25)
        print(f'Plotting all {len(COLS)} columns with prefix {prefix}')
        print('#'*25)
    else:
        COLS = [f'{prefix}_{x}' for x in cols]
        print('#'*25)
        print(f'Plotting {len(COLS)} columns with prefix {prefix}')
        print('#'*25)

    # ITERATE COLUMNS
    for c in COLS:

        # CONVERT DATAFRAME INTO ONE ROW PER (CUSTOMER, TARGET)
        # HOLDING LISTS OF DATES AND FEATURE VALUES
        tmp = df[['customer_ID','S_2',c,'target']].copy()
        tmp2 = tmp.groupby(['customer_ID','target'])[['S_2',c]].agg(list).reset_index()
        tmp3 = tmp2.loc[tmp2.target==1]   # defaulting customers
        tmp4 = tmp2.loc[tmp2.target==0]   # non-defaulting customers

        # FORMAT PLOT
        spec = gridspec.GridSpec(ncols=2, nrows=1,
                             width_ratios=[3, 1], wspace=0.1,
                             hspace=0.5, height_ratios=[1])
        fig = plt.figure(figsize=(20,10))
        ax0 = fig.add_subplot(spec[0])

        # PLOT UP TO display_ct DEFAULT AND display_ct NON-DEFAULT CUSTOMERS.
        # BUG FIX: the original wrapped BOTH classes in one try, so once the
        # smaller class ran out of rows the IndexError also skipped the
        # remaining customers of the other class. Each class now has its own
        # try, and only IndexError is caught (the bare except also swallowed
        # KeyboardInterrupt).
        t0 = []; t1 = []
        for k in range(display_ct):
            # PLOT DEFAULTING CUSTOMERS
            try:
                row = tmp3.iloc[k]
                ax0.plot(row.S_2, row[c], '-o', color='blue')
                t1 += row[c]
            except IndexError:
                pass  # fewer than display_ct defaulters
            # PLOT NON-DEFAULT CUSTOMERS
            try:
                row = tmp4.iloc[k]
                ax0.plot(row.S_2, row[c], '-o', color='orange')
                t0 += row[c]
            except IndexError:
                pass  # fewer than display_ct non-defaulters
        plt.title(f'Feature {c} (Key: BLUE=DEFAULT, orange=no default)',size=18)

        # PLOT HISTOGRAMS
        ax1 = fig.add_subplot(spec[1])
        try:
            # COMPUTE 20 SHARED BINS FROM THE COMBINED VALUE RANGE
            t = t0+t1; mn = np.nanmin(t); mx = np.nanmax(t)
            if mx==mn:
                # degenerate (constant) range: widen slightly so bins are valid
                mx += 0.01; mn -= 0.01
            bins = np.arange(mn, mx+(mx-mn)/20, (mx-mn)/20)
            # skip a class whose values are all NaN (hist would fail)
            if np.sum(np.isnan(t1))!=len(t1):
                ax1.hist(t1,bins=bins,orientation="horizontal",alpha = 0.8,color='blue')
            if np.sum(np.isnan(t0))!=len(t0):
                ax1.hist(t0,bins=bins,orientation="horizontal",alpha = 0.8,color='orange')
        except (TypeError, ValueError):
            pass  # non-numeric (e.g. categorical string) column: no histogram
        plt.show()
        # release the figure explicitly; without this, looping over many
        # columns accumulates open figures and blows up memory
        plt.close(fig)
In [3]:
# Hand-picked feature numbers per prefix to plot with plot_time_series
# (presumably selected after inspecting the full per-prefix plots — TODO confirm).
D_Feature_List = [48,54,55,58,61,62,74,75,77,80,115,118,119]  # Delinquency features
# S_Feature_List = All  (Spend: plot every S_* column, so no list needed)
# P_Feature_List = All  (Payment: plot every P_* column, so no list needed)
B_Feature_List = [1,3,4,7,9,19,20,23,28,33,37,40]  # Balance features
R_Feature_List = [9,10,26,27]  # Risk features

Plot Delinquency Variables¶

In [4]:
# Plot only the hand-picked Delinquency (D_*) features
plot_time_series('D',D_Feature_List)
#########################
Plotting 13 columns with prefix D
#########################

Plot Spend Variables¶

In [5]:
# Plot every Spend (S_*) feature
plot_time_series('S')
#########################
Plotting all 21 columns with prefix S
#########################
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
c:\Users\minda\Documents\AI_project\amex-project(EDA)_Plot_Picked.ipynb 셀 8 in <cell line: 1>()
----> <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=0'>1</a> plot_time_series('S')

c:\Users\minda\Documents\AI_project\amex-project(EDA)_Plot_Picked.ipynb 셀 8 in plot_time_series(prefix, cols, display_ct)
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=18'>19</a> for c in COLS:
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=19'>20</a> 
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=20'>21</a>     # CONVERT DATAFRAME INTO SERIES WITH COLUMN
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=21'>22</a>     tmp = df[['customer_ID','S_2',c,'target']].copy()
---> <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=22'>23</a>     tmp2 = tmp.groupby(['customer_ID','target'])[['S_2',c]].agg(list).reset_index()
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=23'>24</a>     tmp3 = tmp2.loc[tmp2.target==1]
     <a href='vscode-notebook-cell:/c%3A/Users/minda/Documents/AI_project/amex-project%28EDA%29_Plot_Picked.ipynb#X10sZmlsZQ%3D%3D?line=24'>25</a>     tmp4 = tmp2.loc[tmp2.target==0]

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\generic.py:883, in DataFrameGroupBy.aggregate(self, func, engine, engine_kwargs, *args, **kwargs)
    878 if result is None:
    879 
    880     # grouper specific aggregations
    881     if self.grouper.nkeys > 1:
    882         # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
--> 883         return self._python_agg_general(func, *args, **kwargs)
    884     elif args or kwargs:
    885         # test_pass_args_kwargs gets here (with and without as_index)
    886         # can't return early
    887         result = self._aggregate_frame(func, *args, **kwargs)

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\groupby.py:1490, in GroupBy._python_agg_general(self, func, *args, **kwargs)
   1486 name = obj.name
   1488 try:
   1489     # if this function is invalid for this dtype, we will ignore it.
-> 1490     result = self.grouper.agg_series(obj, f)
   1491 except TypeError:
   1492     warn_dropping_nuisance_columns_deprecated(type(self), "agg")

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\ops.py:972, in BaseGrouper.agg_series(self, obj, func, preserve_dtype)
    969     result = self._aggregate_series_pure_python(obj, func)
    971 elif not isinstance(obj._values, np.ndarray):
--> 972     result = self._aggregate_series_pure_python(obj, func)
    974     # we can preserve a little bit more aggressively with EA dtype
    975     #  because maybe_cast_pointwise_result will do a try/except
    976     #  with _from_sequence.  NB we are assuming here that _from_sequence
    977     #  is sufficiently strict that it casts appropriately.
    978     preserve_dtype = True

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\ops.py:1003, in BaseGrouper._aggregate_series_pure_python(self, obj, func)
   1000 # equiv: splitter = self._get_splitter(obj, axis=0)
   1001 splitter = get_splitter(obj, ids, ngroups, axis=0)
-> 1003 for i, group in enumerate(splitter):
   1004     group = group.__finalize__(obj, method="groupby")
   1005     res = func(group)

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\ops.py:1233, in DataSplitter.__iter__(self)
   1230 starts, ends = lib.generate_slices(self.slabels, self.ngroups)
   1232 for start, end in zip(starts, ends):
-> 1233     yield self._chop(sdata, slice(start, end))

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\groupby\ops.py:1246, in SeriesSplitter._chop(self, sdata, slice_obj)
   1244 def _chop(self, sdata: Series, slice_obj: slice) -> Series:
   1245     # fastpath equivalent to `sdata.iloc[slice_obj]`
-> 1246     mgr = sdata._mgr.get_slice(slice_obj)
   1247     # __finalize__ not called here, must be applied by caller if applicable
   1248     return sdata._constructor(mgr, name=sdata.name, fastpath=True)

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\internals\managers.py:1827, in SingleBlockManager.get_slice(self, slobj, axis)
   1824     raise IndexError("Requested axis not found in manager")
   1826 blk = self._block
-> 1827 array = blk._slice(slobj)
   1828 bp = BlockPlacement(slice(0, len(array)))
   1829 block = type(blk)(array, placement=bp, ndim=1)

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\internals\blocks.py:313, in Block._slice(self, slicer)
    310 def _slice(self, slicer) -> ArrayLike:
    311     """return a slice of my values"""
--> 313     return self.values[slicer]

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\arrays\datetimelike.py:341, in DatetimeLikeArrayMixin.__getitem__(self, key)
    333 """
    334 This getitem defers to the underlying array, which by-definition can
    335 only handle list-likes, slices, and integer scalars
    336 """
    337 # Use cast as we know we will get back a DatetimeLikeArray or DTScalar,
    338 # but skip evaluating the Union at runtime for performance
    339 # (see https://github.com/pandas-dev/pandas/pull/44624)
    340 result = cast(
--> 341     "Union[DatetimeLikeArrayT, DTScalarOrNaT]", super().__getitem__(key)
    342 )
    343 if lib.is_scalar(result):
    344     return result

File c:\Users\minda\.conda\envs\tf20\lib\site-packages\pandas\core\arrays\_mixins.py:280, in NDArrayBackedExtensionArray.__getitem__(self, key)
    277 # error: Incompatible types in assignment (expression has type "ExtensionArray",
    278 # variable has type "Union[int, slice, ndarray]")
    279 key = extract_array(key, extract_numpy=True)  # type: ignore[assignment]
--> 280 key = check_array_indexer(self, key)
    281 result = self._ndarray[key]
    282 if lib.is_scalar(result):

KeyboardInterrupt: 

Plot Payment Variables¶

In [ ]:
# Plot every Payment (P_*) feature
plot_time_series('P')
#########################
Plotting all 3 columns with prefix P
#########################

Plot Balance Variables¶

In [ ]:
# Plot only the hand-picked Balance (B_*) features
plot_time_series('B',B_Feature_List)
#########################
Plotting 11 columns with prefix B
#########################

Plot Risk Variables¶

In [ ]:
# Plot only the hand-picked Risk (R_*) features
plot_time_series('R',R_Feature_List)
#########################
Plotting 3 columns with prefix R
#########################

Summary¶

  • 장점 : 전체적인 데이터 패턴 파악이 쉽다.
  • 단점 : 데이터들의 대략적인 값만 알 수 있다. 결측치 여부를 파악하기 어렵다. 데이터 분포의 정확한 비율을 알기 어렵다.
  • 특이사항 : 특성별로 특정 패턴이 반복되어 나타남을 알 수 있다. 또한 몇몇 특성의 경우 파산, 비파산에 따라 데이터의 패턴이 상이하여 중요한 변수로 판단된다.